# Web Scraping for used vehicle information (Autotrader website)
This tool is adapted from Jean-Nicholas Hould's "Web Scraping for Beers" tutorial (http://www.jeannicholashould.com/python-web-scraping-tutorial-for-craft-beers.html).

In [7]:
#routine to import required Python libraries
from urllib.request import Request,urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
 
# Python 3.6 :: Anaconda 4.3.1 (x86_64)
# pandas==0.19.2
# beautifulsoup4==4.5.3


In [8]:
# Determines if a supplied textstring (httpstring) is a valid, reachable URL
def is_valid_URL(httpstring):
    """Return True if ``httpstring`` can be opened with ``urlopen``.

    Args:
        httpstring: The URL text to probe.

    Returns:
        True when ``urlopen`` succeeds, False when the string is not a
        well-formed URL or the resource cannot be opened.
    """
    # Local import keeps this block self-contained; http.client raises its own
    # (non-OSError) exceptions for protocol-level failures such as InvalidURL.
    from http.client import HTTPException

    try:
        response = urlopen(httpstring)
    except (OSError, ValueError, HTTPException):
        # OSError covers URLError/HTTPError and socket failures (IOError is an
        # alias); ValueError is raised for malformed strings, e.g. no scheme.
        return False
    # The response was only opened as a probe: release it straight away.
    response.close()
    return True

In [9]:
# Reports whether the collection of <li> tags is long enough to be read at
# position check_index, i.e. whether another vehicle attribute is available
def index_valid(li_tag,check_index):
    """Return True when ``check_index`` is a readable position in ``li_tag``.

    Args:
        li_tag: Sized collection of <li> tags from the current html block.
        check_index: Zero-based position the caller wants to read.

    Returns:
        True if ``li_tag`` holds more than ``check_index`` items, else False.
    """
    return check_index < len(li_tag)

In [10]:
# Autotrader only provides data for the first 100 result pages of a search
_MAX_RESULT_PAGES = 100

# Record keys filled, in order, from the <li> entries of 'listing-key-specs'.
# The attributes are not systematically ordered on the site, so this positional
# mapping is deliberately crude and left for data munging later.
_SPEC_FIELDS = (
    "vehicleyear",
    "vehiclestyle",
    "vehiclemileage",
    "vehicledrive",
    "vehiclesize",
    "vehiclebhp",
    "vehiclefuel",
)


def _build_search_url(make, model, min_mileage, max_mileage, page=None):
    """Build the Autotrader search URL (search centred on a Leicester postcode)."""
    url = (
        "https://www.autotrader.co.uk/car-search?sort=mileage&radius=1500&postcode=le11tg"
        + "&make=" + make
        + "&model=" + model
        + "&minimum-mileage=" + str(min_mileage)
        + "&maximum-mileage=" + str(max_mileage)
    )
    if page is not None:
        url += "&page=" + str(page)
    return url


def _fetch_soup(url):
    """Download ``url`` and return it parsed; the HTTP response is always closed."""
    with urlopen(url) as response:
        return BeautifulSoup(response, 'html.parser')


def _page_count(soup):
    """Read the number of result pages from the 'paginationMini__count' label."""
    counters = soup.find_all("li", {'class': 'paginationMini__count'})
    # As before, the last pagination label wins when several are present.
    strong_tags = counters[-1].find_all("strong") if counters else []
    if len(strong_tags) < 2:
        # No pagination label found: assume a single page of results.
        return 1
    try:
        # The second <strong> tag holds the page total.
        return int(strong_tags[1].get_text().strip().replace(",", ""))
    except ValueError:
        return 1


def _parse_listing(listing):
    """Turn one 'search-listing' article into a record dict (missing fields stay '')."""
    record = {
        "vehicletitle": "",
        "vehicleyear": "",
        "vehiclestyle": "",
        "vehiclemileage": "",
        "vehicledrive": "",
        "vehiclesize": "",
        "vehiclebhp": "",
        "vehiclefuel": "",
        "vehicledescription": "",
        "vehicleprice": "",
    }

    if listing.h2 is not None and listing.h2.a is not None:
        record["vehicletitle"] = listing.h2.a.text

    # Spec tags are looked up afresh for every listing, so a listing without a
    # spec list can never inherit the previous vehicle's attributes.
    spec_lists = listing.find_all("ul", {'class': 'listing-key-specs'})
    li_tags = spec_lists[-1].find_all("li") if spec_lists else []
    # zip stops at the shorter sequence, so listings with fewer attributes
    # simply leave the remaining fields empty.
    for field, li_tag in zip(_SPEC_FIELDS, li_tags):
        record[field] = li_tag.get_text()

    # capture description details from free text field
    desc_tag = listing.find("p", {'class': 'listing-description'})
    if desc_tag is not None:
        record["vehicledescription"] = desc_tag.text

    # capture vehicle price from html tag 'price-column'
    price_tag = listing.find("section", {'class': 'price-column'})
    if price_tag is not None and price_tag.a is not None:
        price_div = price_tag.a.find("div", {'class': 'vehicle-price'})
        if price_div is not None:
            record["vehicleprice"] = price_div.text

    return record


def get_all_vehicles(entermake="VAUXHALL", entermodel=("AGILA",), mileage=(0, 10000), delay=1):
    """Scrape Autotrader search results and return one record per listed vehicle.

    Args:
        entermake: Vehicle make as used in the search URL.
        entermodel: Iterable of model names, already URL-encoded where needed
            (e.g. "ASTRA%20GTC").
        mileage: Ascending sequence of mileage boundaries. Each consecutive
            pair is searched separately; because Autotrader only serves the
            first 100 pages of a search, narrower bands help capture all
            vehicles. The values are arbitrary.
        delay: Seconds to wait before each result-page request.

    Returns:
        A list of dicts with the keys "vehicletitle", "vehicleyear",
        "vehiclestyle", "vehiclemileage", "vehicledrive", "vehiclesize",
        "vehiclebhp", "vehiclefuel", "vehicledescription" and "vehicleprice".
        Values are the raw text from the page, or "" when absent.

    Raises:
        OSError: If the initial search page for a model/mileage band cannot be
            fetched (includes urllib's URLError/HTTPError). Individual result
            pages that fail are reported and skipped instead.
    """
    # Model lists previously used with this scraper (pass as ``entermodel``):
    #   ["AGILA","AMPERA", "ANTARA", "CALIBRA", "CASCADA", "CAVALIER", "COMBO%20TOUR","CROSSLAND%20X","FRONTERA","GRANDLAND%20X","LOTUS%20CARLTON","MALOO", "MONARO","NOVA","OMEGA","SENATOR","SIGNUM","TIGRA","VECTRA","VIVA","VIVARO","VX220","VXR8"]
    #   ["ADAM", "ASTRA", "ASTRA%20GTC", "CORSA","INSIGNIA","MERIVA","MOKKA","MOKKA%20X","ZAFIRA","ZAFIRA%20TOURER"]
    # Fuller mileage segmentation previously used (pass as ``mileage``):
    #   [0,500,5000,10000,15000,20000,25000,30000,35000,40000,45000,50000,60000,70000,80000,90000,100000,125000,150000,200000]
    vehicles = []
    # loop through all requested models
    for model in entermodel:
        # loop through all consecutive mileage ranges
        for min_mileage, max_mileage in zip(mileage, mileage[1:]):
            # the un-paged search is fetched once, only to learn the page count
            first_page = _fetch_soup(
                _build_search_url(entermake, model, min_mileage, max_mileage)
            )
            pagetotal = _page_count(first_page)
            print("page ", pagetotal)
            # cap at the number of pages Autotrader actually serves
            pagetotal = min(pagetotal, _MAX_RESULT_PAGES)

            # loop through all website pages
            for page in range(1, pagetotal + 1):
                url = _build_search_url(
                    entermake, model, min_mileage, max_mileage, page=page
                )
                print(page, url)
                # include a delay between webpage requests
                time.sleep(delay)
                try:
                    # a single request per page: fetching doubles as validation
                    page_soup = _fetch_soup(url)
                except OSError as err:
                    print("skipping page", page, "-", err)
                    continue
                # the details of each vehicle matching the criteria are held in
                # the html tag with label 'search-listing'
                for listing in page_soup.find_all("article", {'class': 'search-listing'}):
                    vehicles.append(_parse_listing(listing))

    return vehicles

In [11]:
# create a structured pandas dataframe containing the vehicle details
# get_all_vehicles() requests the Autotrader pages and returns a list of dicts
# (one per listing); DataFrame turns each dict key into a column
vehicle_list = get_all_vehicles()    
vl = pd.DataFrame(vehicle_list)
# bare expression: shows the dataframe as the cell output when run in a notebook
vl

page  3
1 https://www.autotrader.co.uk/car-search?sort=mileage&radius=1500&postcode=le11tg&make=VAUXHALL&model=AGILA&minimum-mileage=0&maximum-mileage=10000&page=1
2 https://www.autotrader.co.uk/car-search?sort=mileage&radius=1500&postcode=le11tg&make=VAUXHALL&model=AGILA&minimum-mileage=0&maximum-mileage=10000&page=2
3 https://www.autotrader.co.uk/car-search?sort=mileage&radius=1500&postcode=le11tg&make=VAUXHALL&model=AGILA&minimum-mileage=0&maximum-mileage=10000&page=3


Unnamed: 0,vehiclebhp,vehicledescription,vehicledrive,vehiclefuel,vehiclemileage,vehicleprice,vehiclesize,vehiclestyle,vehicletitle,vehicleyear
0,92bhp,This Price excludes any other offer and includ...,Manual,Petrol,"6,747 miles","£6,699",1.2L,Hatchback,Vauxhall Agila Se 1.3 5dr,2014
1,92 bhp,"Privately used, one owner car with Vauxhall Se...",Manual,Petrol,"1,000 miles","£6,795",1.2L,Hatchback,Vauxhall Agila 1.2 SE 5DR,2014 (64 reg)
2,92 bhp,"""IMMEDIATE DRIVE AWAY!"" Black, ***IJC CAR SALE...",Automatic,Petrol,"1,059 miles","£8,995",1.2L,Hatchback,Vauxhall Agila 1.2 i ecoFLEX 16v SE 5dr (a/c),2014 (64 reg)
3,92 bhp,"WHITE, The UK's first and best national Automa...",Automatic,Petrol,"2,000 miles","£6,995",1.2L,Hatchback,Vauxhall Agila SE 1.3 5dr,2013 (63 reg)
4,92 bhp,"Metalic Silver, £6,182, Call 01476 539510, Fu...",Manual,Petrol,"2,324 miles","£6,182",1.2L,Hatchback,Vauxhall Agila 1.2 VVT ecoFLEX SE 5dr,2014 (64 reg)
5,92 bhp,"Brown, £7,500, Very low mileage, Full service ...",Automatic,Petrol,"2,752 miles","£7,500",1.2L,Hatchback,Vauxhall Agila 1.2 VVT SE 5dr Auto,2014 (14 reg)
6,92 bhp,"Macadamia Metallic, £6,562, Call 01234 245823,...",Manual,Petrol,"3,684 miles","£6,562",1.2L,Hatchback,Vauxhall Agila 1.2 VVT ecoFLEX SE 5dr,2014 (64 reg)
7,92 bhp,"Moroccan Blue, £6,227, Call 01752 825545, *On...",Manual,Petrol,"3,936 miles","£6,227",1.2L,Hatchback,Vauxhall Agila 1.2 VVT ecoFLEX SE 5dr,2014 (14 reg)
8,92 bhp,all vehicles are fully serviced with a 12 mont...,Automatic,Petrol,"4,000 miles","£7,995",1.2L,Hatchback,Vauxhall Agila 1.2 i ecoFLEX 16v SE 5dr (a/c),2014 (64 reg)
9,92 bhp,"Retail price £5995,with £500 minimum part exch...",Manual,Petrol,"4,217 miles","£5,495",1.2L,Hatchback,Vauxhall Agila 1.2 SE 5d 93 BHP,2013 (13 reg)


In [12]:
# write the dataframe to a CSV file with its columns in a fixed order
csv_columns = [
    'vehicletitle',
    'vehicleyear',
    'vehiclestyle',
    'vehiclemileage',
    'vehicledrive',
    'vehiclesize',
    'vehiclebhp',
    'vehiclefuel',
    'vehicledescription',
    'vehicleprice',
]
vlnew = vl[csv_columns]
vlnew.to_csv('Vehicles_Vauxhall_2017.csv')
# first row, first column (the title of the first vehicle) as a quick check
vlnew.iloc[0,0]

'Vauxhall Agila Se 1.3 5dr'